#######################################################################################
# Downloads and cleans Google Ngram annual counts for each language
#
# It was run with Julia 0.6 and requires that the PyCall Julia package is installed,
# as well as the google-ngram-downloader python package.
#######################################################################################

using PyCall

# requires that the google-ngram-downloader python package is installed
# e.g. pip install google_ngram_downloader
@pyimport google_ngram_downloader as gnd

"""
    gbndownload(ngram_len, lang)

Download the Google Books Ngram files of length `ngram_len` for language `lang` into the
directory named by the global `gndir`.

The list of files comes from `iter_google_store` in the `google_ngram_downloader` Python
package (bound to `gnd` through PyCall); `ngram_len` and `lang` are forwarded to it as-is.
Files already present in `gndir` are skipped, so an interrupted run can be restarted.
A failed download is reported with a warning and does not stop the remaining downloads.
"""
function gbndownload(ngram_len,lang)
  gbn5 = gnd.util[:iter_google_store](ngram_len=ngram_len, lang=lang, verbose=true, getrequest=false)
  for r in gbn5
    # each item is (file name, url, request); only the first two are needed here
    fname, url = r
    dest = joinpath(gndir,fname)
    isfile(dest) && continue
    # Download under a temporary name and rename only on success. A transfer that
    # dies half-way must not leave a truncated file at `dest`, because the `isfile`
    # check above would then skip it on every later run.
    partial = dest * ".part"
    try
      download(url,partial)
      mv(partial,dest)
    catch e
      # best effort: drop the incomplete file, report the cause, and keep going
      rm(partial; force=true)
      warn("failed to download $fname from $url: $(sprint(showerror, e))")
    end
  end
end

"""
    cleantotal(lang)

Write a cleaned copy of the Google Books total-counts file for `lang`.

Reads `googlebooks-<lang>-all-totalcounts-20120701.txt` from the directory named by the
global `totalsdir`, removes every space-followed-by-tab pair, turns each remaining tab
into a newline, and writes the result to the same directory with `-cleaned` added before
the `.txt` extension. Returns the number of bytes written.
"""
function cleantotal(lang)
  totalsfile = joinpath(totalsdir,"googlebooks-$lang-all-totalcounts-20120701.txt")
  totalsfile2 = joinpath(totalsdir,"googlebooks-$lang-all-totalcounts-20120701-cleaned.txt")
  s = read(totalsfile, String)
  # Pair form of `replace`: the positional `replace(s, pat, repl)` form was deprecated
  # in Julia 0.7 and removed in 1.0, whereas `read(..., String)` already needs 0.7+.
  s = replace(s, " \t" => "")
  s = replace(s, "\t" => "\n")
  return write(totalsfile2, s)
end












#
